# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------

# Autogenerated By   : src/main/python/generator/generator.py
# Autogenerated From : scripts/builtin/quantizeByCluster.dml

from typing import Dict, Iterable

from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar
from systemds.utils.consts import VALID_INPUT_TYPES


def quantizeByCluster(X: Matrix,
                      **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
     Product quantization via the quantizeByCluster builtin. The original vector
     space is first split into M subspaces, and the resulting lower dimensional
     subvectors are then quantized. When the number of columns is not divisible by
     M, the data is padded with zeros. If the data follows a Gaussian distribution,
     an optimal space decomposition can be computed. Quantization is done with
     kmeans; the space decomposition is computed with svd.

     This wrapper only builds the operation node: X is passed as the named input
     'X' and every keyword argument is forwarded unchanged as a named input.



    :param X: The input matrix to perform product quantization on
    :param M: Number of subspaces
    :param k: Number of vectors in the subcodebooks
    :param runs: Number of runs (with different initial centroids)
    :param max_iter: Maximum number of iterations per run
    :param eps: Tolerance (epsilon) for WCSS change ratio
    :param avg_sample_size_per_centroid: Average number of records per centroid in data samples
    :param separate: Cluster subspaces separately. If set to true, kmeans is
        run M times, once per subspace. Otherwise kmeans is run
        only once.
    :param space_decomp: Decompose the vector space by multiplying the input
        matrix X with an orthogonal matrix R. Assumes the data
        follows a parametric Gaussian distribution.
        Time complexity in O(nrow(X)^2 * min(nrow(X), ncol(X))).
    :param seed: The seed used for initial sampling. If set to -1 random
        seeds are selected.
    :return: The matrix containing the centroids. If clustered separately, the ith
        subcodebook is the ith chunk of size k. The codebook matrix has the dimensions
        [k*M x ncol(X)/M].
    :return: The mapping of vectors to centroids. Each vector of the input matrix X is
        mapped onto a vector of codes whose entries are indices of vectors in the
        codebook. The codes matrix has the dimensions [nrow(X) x M].
    :return: The orthogonal matrix R which is applied to the input matrix X before
        performing the product quantization. Only relevant when space_decomp = TRUE.
    """

    # X first, then the caller's keyword arguments, all as named inputs.
    named_inputs = {'X': X, **kwargs}
    context = X.sds_context

    # One unnamed placeholder Matrix for each of the three outputs.
    results = [Matrix(context, '') for _ in range(3)]

    multi_op = MultiReturn(context, 'quantizeByCluster', results,
                           named_input_nodes=named_inputs)

    # Each output node depends solely on the multi-return operation.
    for result in results:
        result._unnamed_input_nodes = [multi_op]

    return multi_op
